import re  # 用于正则表达式操作
import pandas as pd  # 用于数据处理，读取CSV文件
import numpy as np  # 用于数值计算
import jieba  # 用于中文分词
import xgboost as xgb  # 用于XGBoost模型
import warnings  # 用于警告处理,在很多情况下是为了让输出更简洁
warnings.filterwarnings('ignore')  # 不显示警告
from sklearn.feature_extraction.text import CountVectorizer  # 用于文本特征提取
from sklearn.ensemble import VotingClassifier  # 用于投票分类器
from sklearn.model_selection import StratifiedKFold, train_test_split  # 用于交叉验证和数据集划分
from sklearn.linear_model import LogisticRegression  # 用于逻辑回归模型
from sklearn.tree import DecisionTreeClassifier#决策树
from sklearn.metrics import accuracy_score
 
def text_predeal(temp):
    """Normalize a raw comment string before tokenization.

    Steps (order matters):
      1. Strip HTML tags FIRST. The original code ran this step after the
         character filter, but that filter replaces '<' and '>' with spaces,
         so the tag pattern could never match and tag names leaked into the
         cleaned text.
      2. Replace every character that is not Chinese (CJK unified), an ASCII
         letter/digit, or one of a few full-width punctuation marks with a
         space.
      3. Drop the literal noise token '网站'.
      4. Trim leading/trailing whitespace.

    Args:
        temp: raw comment text.

    Returns:
        The cleaned string.
    """
    temp = re.sub(r'<.*?>', ' ', temp)  # remove HTML tags while '<'/'>' are intact
    # Keep Chinese, ASCII letters/digits, and selected full-width punctuation.
    temp = re.sub(r'[^\u4e00-\u9fa5A-Za-z0-9，。？：！；“”]', ' ', temp)
    temp = temp.replace('网站', '')  # drop a domain-specific noise word
    return temp.strip()

def jiebafenci(sentences, stop_words):
    """Segment Chinese text with jieba and remove stop words.

    Args:
        sentences: the text to segment.
        stop_words: iterable of tokens to exclude from the result.

    Returns:
        A single space-joined string of the kept tokens.
    """
    # Build a set once: O(1) membership per token instead of an O(n)
    # list scan for every word in the sentence.
    excluded = set(stop_words)
    excluded.add(' ')  # jieba emits bare-space tokens; drop them too
    return " ".join(word for word in jieba.cut(sentences) if word not in excluded)

def init_data(train_path='/data/sfq/oco/train.csv',
              test_path='/data/sfq/oco/test_new.csv',
              stopwords_path='/data/sfq/oco/stopwords.txt'):
    """Load, clean, tokenize and vectorize the train/test comment data.

    The data-file locations were hard-coded; they are now keyword
    parameters whose defaults preserve the original behavior.

    Args:
        train_path: tab-separated training CSV with 'comment' and 'label' columns.
        test_path: comma-separated test CSV with 'comment' and 'id' columns.
        stopwords_path: UTF-8 stop-word file, one token per line.

    Returns:
        Tuple (matrix_train, label_train, matrix_test, test_data):
        dense bag-of-words count matrices sharing one vocabulary, the
        training labels as a numpy array, and the raw test DataFrame
        (its 'id' column is used when writing the submission).
    """
    train_data = pd.read_csv(train_path, sep='\t')
    test_data = pd.read_csv(test_path, sep=',')

    # Remove HTML/noise characters from the raw comments.
    train_data['comment'] = train_data['comment'].apply(text_predeal)
    test_data['comment'] = test_data['comment'].apply(text_predeal)

    # Load the stop-word list (one token per line, blank lines skipped).
    # NOTE: only '\n' is stripped, as in the original — a file with CRLF
    # endings would leave '\r' on each word; confirm the file's line endings.
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop_words = [line.strip('\n') for line in f if line.strip('\n') != '']

    # Segment each comment and drop stop words.
    train_temp = [jiebafenci(text, stop_words) for text in train_data['comment'].values]
    test_temp = [jiebafenci(text, stop_words) for text in test_data['comment'].values]

    # Shared bag-of-words vocabulary for train and test:
    #   min_df=5          -> drop tokens seen in fewer than 5 documents
    #   ngram_range=(1,1) -> unigrams only
    #   token_pattern     -> word-boundary tokens (keeps single characters)
    vector = CountVectorizer(min_df=5, ngram_range=(1, 1), token_pattern=r'\b\w+\b')
    vector.fit(train_temp + test_temp)

    label_train = np.array(train_data['label'].tolist())
    matrix_train = vector.transform(train_temp).toarray()
    matrix_test = vector.transform(test_temp).toarray()
    return matrix_train, label_train, matrix_test, test_data

def K_flod_cross_validation():
    """8-fold stratified CV of a hard-voting ensemble; averages test votes.

    Reads the module-level globals ``matrix_train``, ``lable_train`` and
    ``matrix_test`` set up in ``__main__``. Per fold, the ensemble is
    refit on the fold's training split, its accuracy is printed on the
    validation split, and its 0/1 predictions on the full test set are
    accumulated as an average over all folds.

    Returns:
        Array of shape (n_test, 1); a value >= 0.5 means the ensemble
        voted positive in at least half of the folds.
    """
    n_splits = 8
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2023).split(matrix_train, lable_train)
    y_test_preds = np.zeros((len(matrix_test), 1))

    # Base learners. VotingClassifier clones and fits each estimator
    # internally during its own fit(), so the original per-estimator
    # fit loop inside the fold was redundant work and has been removed.
    logistic_cf = LogisticRegression(C=1.2)  # C = inverse regularization strength
    detree_cf = DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_leaf=1,
                                       ccp_alpha=0.0)  # ccp_alpha: cost-complexity pruning (0 = off)
    xgboost_cf = xgb.XGBClassifier(tree_method='gpu_hist')  # NOTE: requires a GPU-enabled xgboost build
    vote_cf = VotingClassifier(estimators=[('lr', logistic_cf), ('drc', detree_cf), ('xgb', xgboost_cf)],
                               voting='hard')

    for i, (train_set, verify_set) in enumerate(skf):
        vote_cf.fit(matrix_train[train_set], lable_train[train_set])
        # BUG FIX: the original divided by 5 while running 8 folds, so the
        # accumulated "average" could reach 1.6 and the later 0.5 threshold
        # actually meant 2.5-of-8 folds. Divide by the real fold count.
        y_test_preds += vote_cf.predict(matrix_test).reshape(-1, 1) / n_splits
        y_verify_pred = vote_cf.predict(matrix_train[verify_set])
        accuracy = accuracy_score(lable_train[verify_set], y_verify_pred)
        print(f"Fold {i + 1} Accuracy: {accuracy:8f}")
    return y_test_preds
 
 
if __name__ == '__main__':
    # Pipeline: build features, run the cross-validated ensemble, threshold
    # the fold-averaged votes at 0.5, and write the submission file.
    matrix_train, lable_train, matrix_test, data = init_data()
    fold_avg = K_flod_cross_validation()
    final_labels = [1 if vote >= 0.5 else 0 for vote in fold_avg]
    submission = data.copy()
    submission['label'] = final_labels
    submission[['id', 'label']].to_csv('./result.csv', index=None)
 